See https://github.com/GaryNapier/ggtree for this markdown code and data
library(ggplot2) # ggtree is based on ggplot and uses a lot of its code
library(ggtree) # ggtree
library(treeio) # Loads up Beast trees
library(ggnewscale) # For adding new heatmaps (annotations)
library(scales) # For colours
library(gplots) # Also for colours
# Read in beast tree with treeio package
mcc_tree <- treeio::read.beast(mcc_tree_file)
metadata <- read.csv(metadata_file)
beast_clusters <- read.csv(beast_clusters_file, header = F, col.names = c("cluster", "id"))
mcc_tree
## 'treedata' S4 object that stored information of
## 'beast_results/PAKISTAN_ALL.mcc.tree'.
##
## ...@ phylo:
## Phylogenetic tree with 122 tips and 121 internal nodes.
##
## Tip labels:
## ERR3335723_2016, ERR3335724_2016, ERR3335725_2016, ERR3335726_2016, ERR3335727_2016, ERR3335728_2016, ...
##
## Rooted; includes branch lengths.
##
## with the following features available:
## 'CAheight_0.95_HPD', 'CAheight_mean', 'CAheight_median', 'CAheight_range',
## 'height', 'height_0.95_HPD', 'height_median', 'height_range', 'length',
## 'length_0.95_HPD', 'length_median', 'length_range', 'posterior'.
There's lots of metadata here that can be used in the trees (the "with the following features available:" bit)
# I think this is how you get the first (estimated) date of the Beast tree - i.e. the root.
# My root year happens to be 0, but if the root date was different, then I suspect the root.edge would be different
# (i.e. root.edge is not actually in years)
first_date <- mcc_tree@phylo$root.edge
# Get the last overall date from the sample names - year after the last underscore
last_date <- as.numeric(max(unlist(lapply(strsplit(mcc_tree@phylo$tip.label, "_"),
function(x){x[length(x)]}))))
# In ggtree you have to give the last date as a character string in the form of YYYY-MM-DD
last_date_chr <- paste0(as.character(last_date), "-12-31")
# Total n samples
n_samps <- length(mcc_tree@phylo$tip.label)
Set up params/values for the trees:
# Expand the x-axis so there's a bit of room at the start and a very large space (80% of the full timeline)
# to the right for the annotations
x_lim <- c(first_date-10, last_date+(last_date*0.8))
# Expand the y-axis so there's room at the top and bottom
y_lim <- c(-5, n_samps + (n_samps * 0.1))
# Define vertical line colour
v_line_col <- "red"
# Define sequence of year ticks on the x-axis
x_labs_full <- seq(first_date, last_date, by = 100)
# Angle of the labels so they fit in
angle <- 45
ggtree(mcc_tree, mrsd = last_date_chr)
ggtree(mcc_tree, mrsd = last_date_chr) +
theme_tree2()
Code for the tip labels is shown, but commented
ggtree(mcc_tree, mrsd = last_date_chr) +
theme_tree2() +
# Tip labels
# geom_tiplab(align=TRUE, linetype='dashed', linesize=.3, size = 2) +
# Red confidence bars
geom_range("length_0.95_HPD", color='red', size=2, alpha=.5) +
# Text labels for branch posterior scores
# Note - you can filter the values - here only 0.9 and above are shown.
# Also can control the position with the vjust arg.
geom_text2(aes(label=round(as.numeric(posterior), 2),
subset=as.numeric(posterior)> 0.9,
x=branch), vjust=0) +
# Set the y-axis as defined above
coord_cartesian(ylim = y_lim) +
# Set the x-axis limits, and year tick labels (don't know why you need 'breaks' as well as 'labels')
scale_x_continuous(limits = x_lim, breaks = x_labs_full, labels = x_labs_full)+
# Set the appearance of the year labels, including angle
theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1))
# Add id/year hybrid col so samples names are correct - i.e. they match the Beast tree sample names
metadata$id_year <- paste0(metadata$wgs_id, "_", metadata$year)
rownames(metadata) <- metadata$id_year
# Subset metadata
# FORMAT: one col per attribute, rownames of dataframe match names of samples in the tree
lin_data <- metadata[, "main_lineage", drop = F]
dr_status_data <- metadata[, "dr_status", drop = F]
# Convert lin and DR data to factors
lin_data <- data.frame(apply(lin_data, 2, as.factor))
# Change column names for neatness on tree
colnames(dr_status_data) <- "DR status"
colnames(lin_data) <- "Lineage"
Drug resistance data now looks like this. Note - the sample names are the rownames of the one-column dataframe. Also note - some sample names are "XXXX_NA". This is because they've been pulled from the metadata and not all samples in the metadata have years assigned to them and therefore weren't used to create the tree. ggtree will find the ones that are in both the metadata and the tree, so it doesn't matter if you have extra samples in your metadata.
head(dr_status_data)
## DR status
## ERR688030_NA XDR
## ERR688043_NA XDR
## ERR1213917_NA Sensitive
## ERR2510632_NA Pre-MDR
## ERR2510285_NA MDR
## ERR2510586_NA MDR
Lineage data is very similar:
head(lin_data)
## Lineage
## ERR688030_NA 4
## ERR688043_NA 4
## ERR1213917_NA 3
## ERR2510632_NA 3
## ERR2510285_NA 3
## ERR2510586_NA 3
# Define cols for each dataset - one colour per unique value of the datasets
alpha <- 0.9
lin_colours <- rainbow(length(unique(metadata$main_lineage)), alpha = alpha)
dr_status_colours <- scales::alpha(gplots::col2hex(c("green1", "yellow2", "orange1", "red1", "black", "grey")), alpha = alpha)
# Add names (unique values for each dataset) to dataframes.
names(lin_colours) <- c(sort(unique(metadata$main_lineage)))
names(dr_status_colours) <- c("Sensitive", "Pre-MDR", "MDR", "Pre-XDR", "XDR", "Other")
Colour vectors:
lin_colours
## 1 2 3 4
## "#FF0000E6" "#80FF00E6" "#00FFFFE6" "#8000FFE6"
dr_status_colours
## Sensitive Pre-MDR MDR Pre-XDR XDR Other
## "#00FF00E6" "#EEEE00E6" "#FFA500E6" "#FF0000E6" "#000000E6" "#BEBEBEE6"
Parameters for the strip annotations
# Where the annotations are placed (in this case 50 years to the right of the last date)
offset <- 50
# Width of the heatmap strip
width <- 0.05
To add the heatmap/strip annotation, draw the same tree as above and save as an object, then pass to gheatmap()
gg_mcmc_tree <- ggtree(mcc_tree, mrsd = last_date_chr)+
theme_tree2() +
# geom_tiplab(align=TRUE, linetype='dashed', linesize=.3, size = 2) +
geom_range("length_0.95_HPD", color='red', size=2, alpha=.5) +
geom_text2(aes(label=round(as.numeric(posterior), 2),
subset=as.numeric(posterior)> 0.9,
x=branch), vjust=0) +
coord_cartesian(ylim = y_lim) +
scale_x_continuous(breaks = x_labs_full, labels = x_labs_full, limits = x_lim)+
theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1))
# Add lineage data
lin_hm <- gheatmap(gg_mcmc_tree, lin_data,
width = width,
offset = offset,
# Not sure what the color arg does or why it has to be NULL. Rest of the args are self-explanatory
color = NULL,
colnames_position = "top",
colnames_angle = angle,
colnames_offset_y = 1,
hjust = 0,
font.size = 3) +
# Add the custom colours defined above
scale_fill_manual(values = lin_colours, breaks = names(lin_colours) )+
# Define the legend title
labs(fill = "Lineage")
lin_hm
Note - the scale_fill_manual() function took ages to work out - the values are the colours themseves, i.e. #FF0000E6, #80FF00E6, #00FFFFE6, #8000FFE6, and the breaks is the unique values in the data (i.e. 1, 2, 3, 4), which we've used to name the colour vector.
To do this we have to add the saved tree to the new_scale_fill() function from the ggnewscale package # See "7.3.1 Visualize tree with multiple associated matrix" - https://yulab-smu.top/treedata-book/chapter7.html
It is possible to just have a dataset with two columns (i.e. in this case the lineage and DR data) and to concattenate the colours together, but this would mean there's only one legend and the whole thing is less clear.
lin_hm <- lin_hm + ggnewscale::new_scale_fill()
Note - offset has to be increased otherwise the strip will be drawn over the top of the previous one. The rest of the code is the same except for the tree input, data input, colours and legend label
dr_status_hm <- gheatmap(lin_hm, dr_status_data,
width = width,
# Increase offset
offset = offset+100,
color = NULL,
colnames_position = "top",
colnames_angle = angle,
colnames_offset_y = 1,
hjust = 0,
font.size = 3) +
# Add colours for DR status
scale_fill_manual(values = dr_status_colours, breaks = names(dr_status_colours) )+
# New legend label
labs(fill = "DR\nstatus")
dr_status_hm
The last annotation to add is a binary dataset of individual drugs - 0/1 whether the sample is resistant to the drug
Get data and convert numeric 0/1 to factors
dr_data <- dplyr::select(metadata, rifampicin:delamanid)
dr_data <- data.frame(apply(dr_data, 2, as.factor))
Lots of columns with 0/1/NA
head(dr_data)
## rifampicin isoniazid ethambutol pyrazinamide streptomycin
## ERR688030_NA <NA> <NA> <NA> <NA> <NA>
## ERR688043_NA <NA> <NA> <NA> <NA> <NA>
## ERR1213917_NA 0 0 1 <NA> 0
## ERR2510632_NA 0 1 0 0 <NA>
## ERR2510285_NA 1 1 0 1 <NA>
## ERR2510586_NA 1 1 0 1 <NA>
## ofloxacin moxifloxacin levofloxacin amikacin kanamycin
## ERR688030_NA <NA> <NA> <NA> <NA> <NA>
## ERR688043_NA <NA> <NA> <NA> <NA> <NA>
## ERR1213917_NA <NA> <NA> <NA> <NA> <NA>
## ERR2510632_NA <NA> <NA> <NA> <NA> <NA>
## ERR2510285_NA <NA> <NA> <NA> <NA> <NA>
## ERR2510586_NA <NA> <NA> <NA> <NA> <NA>
## capreomycin ciprofloxacin prothionamide ethionamide
## ERR688030_NA <NA> <NA> <NA> <NA>
## ERR688043_NA <NA> <NA> <NA> <NA>
## ERR1213917_NA <NA> <NA> <NA> <NA>
## ERR2510632_NA <NA> <NA> <NA> <NA>
## ERR2510285_NA <NA> <NA> <NA> <NA>
## ERR2510586_NA <NA> <NA> <NA> <NA>
## clarithromycin clofazimine bedaquiline cycloserine linezolid
## ERR688030_NA <NA> <NA> <NA> <NA> <NA>
## ERR688043_NA <NA> <NA> <NA> <NA> <NA>
## ERR1213917_NA <NA> <NA> <NA> <NA> <NA>
## ERR2510632_NA <NA> <NA> <NA> <NA> <NA>
## ERR2510285_NA <NA> <NA> <NA> <NA> <NA>
## ERR2510586_NA <NA> <NA> <NA> <NA> <NA>
## para_aminosalicylic_acid rifabutin delamanid
## ERR688030_NA <NA> <NA> <NA>
## ERR688043_NA <NA> <NA> <NA>
## ERR1213917_NA <NA> <NA> <NA>
## ERR2510632_NA <NA> <NA> <NA>
## ERR2510285_NA <NA> <NA> <NA>
## ERR2510586_NA <NA> <NA> <NA>
Add the tree to new_scale_fill() again:
dr_status_hm <- dr_status_hm + ggnewscale::new_scale_fill()
This time color is changed to "black" I think as the 'base' colour, and low and high added. I think these assign the 0 value to white and the 1 to black. NA is defined in the scale_fill_manual() function.
This time in scale_fill_manual(), instead of breaks, for some reason labels is used to map the 0/1 to the legend labels.
gheatmap(dr_status_hm, dr_data,
# Increase offset
offset = offset+200,
width = width+0.5,
# Change color to black
# color = NULL,
color="black",
low="white",
high="black",
colnames_position = "top",
colnames_angle = angle,
colnames_offset_y = 1,
hjust = 0,
font.size = 2.5) +
# Define colours
scale_fill_manual(values=c("white", "black"), labels = c("Sensitive", "Resistant", "NA"), na.value = "grey")+
labs(fill = "Drug\nresistance")
The drug binary data looks a bit squashed on this webpage but I'll show how to save to pdf later.
Actually the whole tree is a bit squashed at the tips because the samples were taken from possible transmission cases, but are also from different lineages so have MRCA a long time ago. So it might be useful to zoom in on the tips.
If we specify a year to limit the x-axis, then only those clades that have MRCA after this year are drawn, which might be useful for zooming in on clusters.
Zoom tree setup
The code is basically the same, but the x-axis limits are changed. Note - the offset and width have to be much smaller.
I've also added a vertical red line to show when the last date was.
zoom_range <- c((last_date+1) - 50, (last_date + 1) + 50)
zoom_range_seq <- seq(zoom_range[1], (last_date+1), 2)
offset_zoom <- 0
width_zoom <- 0.001
gg_mcmc_tree_zoom <- ggtree(mcc_tree, mrsd = last_date_chr) +
theme_tree2() +
coord_cartesian(ylim = y_lim) +
scale_x_continuous(breaks = zoom_range_seq,
labels = zoom_range_seq,
limits = zoom_range) +
# Add vertical line
geom_vline(aes(xintercept = 2020), col = "red")+
theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1))
# Add lin data
lin_hm <- gheatmap(gg_mcmc_tree_zoom, lin_data,
width = width_zoom,
offset = offset_zoom,
color = NULL,
colnames_position = "top",
colnames_angle = angle, colnames_offset_y = 1,
hjust = 0,
font.size = 3) +
scale_fill_manual(values = lin_colours, breaks = names(lin_colours) )+
labs(fill = "Lineage")
lin_hm <- lin_hm + ggnewscale::new_scale_fill()
# Add DR status
dr_status_hm <- gheatmap(lin_hm, dr_status_data,
width = width_zoom,
offset = offset_zoom + 2,
color = NULL,
colnames_position = "top",
colnames_angle = angle, colnames_offset_y = 1,
hjust = 0,
font.size = 3) +
scale_fill_manual(values = dr_status_colours, breaks = names(dr_status_colours) )+
labs(fill = "DR\nstatus")
# Add DR individual status
dr_status_hm <- dr_status_hm + ggnewscale::new_scale_fill()
gheatmap(dr_status_hm, dr_data,
offset = offset_zoom + 4,
width = width_zoom + 0.02,
# color = NULL,
low="white", high="black", color="black",
colnames_position = "top",
colnames_angle = angle,
colnames_offset_y = 1,
hjust = 0,
font.size = 2.5,
legend_title = "llw") +
scale_fill_manual(values=c("white", "black"), labels = c("Sensitive", "Resistant", "NA"), na.value = "grey")+
labs(fill = "Drug\nresistance")
If we want to plot these individual clusters it's possible to loop over lists of samples, subset the tree, and run the ggtree code
This is the list of samples and their groups - all the clusters that result from cutting the tree at 50 years before the last sample. This file was created with my cut_tree.py function.
head(beast_clusters)
## cluster id
## 1 1 ERR3335724_2016
## 2 1 ERR3335759_2016
## 3 2 ERR3335726_2016
## 4 2 ERR3335728_2016
## 5 2 taj_pakistan_05_2017
## 6 2 taj_pakistan_11_2017
Split the file into each cluster to loop over:
clusters_split <- split(beast_clusters, beast_clusters$cluster)
head(clusters_split)
## $`1`
## cluster id
## 1 1 ERR3335724_2016
## 2 1 ERR3335759_2016
##
## $`2`
## cluster id
## 3 2 ERR3335726_2016
## 4 2 ERR3335728_2016
## 5 2 taj_pakistan_05_2017
## 6 2 taj_pakistan_11_2017
## 7 2 taj_pakistan_57_2019
##
## $`3`
## cluster id
## 8 3 ERR3335789_2017
## 9 3 ERR3335792_2017
## 10 3 ERR3335799_2017
## 11 3 taj_pakistan_09_2017
## 12 3 taj_pakistan_44_2017
##
## $`4`
## cluster id
## 13 4 ERR3335765_2016
## 14 4 ERR3335778_2016
## 15 4 ERR3335782_2017
##
## $`5`
## cluster id
## 16 5 ERR3335752_2016
## 17 5 ERR3335797_2017
##
## $`6`
## cluster id
## 18 6 ERR3335756_2016
## 19 6 ERR3335764_2016
## 20 6 ERR3335758_2016
Notes:
Usually I'd save these as a pdf (one plot per page, and one page per loop) - the relevent code is pdf(file = <filename>) at the start of the loop to open the pdf file, print(<plot_name>) during the loop, and dev.off() at the end to finish saving it. These lines are commented here, except the print(<plot_name>) so I can show the plots here.
There's only one function in treeio to subset trees, which is treeio::drop.tip, so I have to first get the samples that aren't in the cluster of interest with %in%
I use a new way here of getting first and last dates by creating the basic tree first - the theme_tree2() obviously does some calculations on the edge lengths to get the dates, but I haven't pulled out the function, rather I just run it and get the dates.
Here I program the x-axis scale and labels according to the order of magnitude (oom) of the date range. If I just leave it to the defaults it's a bit a of a mess. The log10_ceiling() function determines the oom.
I also have to determine the x-axis ranges and the space to the right of the tree for the annotations by taking proportions of the date range - so if a cluster's MRCA stretches back 50 years compared to 5 years, the spaces can get too squashed or stretched, so evserything's done as percentages of the time span. This also applies to the offsets and widths, which are dependent on the x-axis.
Similarly, I also have to change the y-axis space because the number of samples in each cluster changes a lot.
# Get sample names from whole tree
mcc_tree_tips <- mcc_tree@phylo$tip.label
# pdf(file = beast_clusters_pdf_file)
for(i in seq(clusters_split)){
# Get the names of samples from cluster i
names_to_keep <- clusters_split[[i]]$id
# Get the names from the full Beast tree that aren't these names
names_to_drop <- mcc_tree_tips[!(mcc_tree_tips %in% names_to_keep)]
# Create new sub tree
clust_tree <- treeio::drop.tip(mcc_tree, names_to_drop)
# Do tree first to get first and last date and ranges
last_date_clust <- as.numeric(max(unlist(lapply(strsplit(clust_tree@phylo$tip.label, "_"), function(x){x[length(x)]}))))
last_date_clust_chr <- paste0(last_date_clust-1, "-12-31")
gg_clust_tree <- ggtree(clust_tree, mrsd = last_date_clust_chr) +
theme_tree2()
last_date_clust <- ceiling(max(gg_clust_tree$data$x))
first_date_clust <- floor(min(gg_clust_tree$data$x))
date_range_clust <- c(first_date_clust, last_date_clust)
year_span_clust <- diff(date_range_clust)
# N samples in cluster i
n_samps <- length(clust_tree@phylo$tip.label)
# Get order of magnitude of the date range
oom <- log10_ceiling(year_span_clust)
if(oom == 1){
by <- 1
}else if(oom == 10){
by <- 2
}else if(oom == 100){
by <- 10
}else{
by <- 100
}
# Create seq of dates for x-axis
zoom_range_seq_clust <- seq(first_date_clust,
last_date_clust,
by = by)
# Determine space to right of tree for annotations as a multiple of total time range
post_tree_multiply <- 5
post_tree_span <- (year_span_clust * post_tree_multiply)
x_lim_clust <- c(first_date_clust - (year_span_clust*0.2), last_date_clust + post_tree_span)
# Same for y-axis according to number of samples
y_lim_clust <- c(0 - floor(n_samps*0.1), n_samps + ceiling(n_samps*0.2) )
# Sort out the offsets and widths
width_clust <- 0.035 * post_tree_multiply # % of tree time span
offset_clust <- (width_clust * year_span_clust)
# Make tree
gg_clust_tree <- ggtree(clust_tree, mrsd = last_date_clust_chr) %<+% metadata +
theme_tree2() +
coord_cartesian(ylim = y_lim_clust) +
scale_x_continuous(breaks = zoom_range_seq_clust,
labels = zoom_range_seq_clust,
limits = x_lim_clust) +
geom_vline(aes(xintercept = last_date_clust), col = "red")+
theme(axis.text.x = element_text(face = "bold", size = 8, angle = 45, hjust = 1))
# Add lin data
lin_hm_clust <- gheatmap(gg_clust_tree, lin_data,
width = width_clust,
offset = 0,
color = NULL,
colnames_position = "top",
colnames_angle = angle,
# colnames_offset_y = 1,
hjust = 0,
font.size = 3) +
scale_fill_manual(values = lin_colours, breaks = names(lin_colours) )+
labs(fill = "Lineage")
lin_hm_clust <- lin_hm_clust + ggnewscale::new_scale_fill()
# Add DR status clusters
dr_status_hm_clust <- gheatmap(lin_hm_clust, dr_status_data,
width = width_clust,
offset = offset_clust,
color = NULL,
colnames_position = "top",
colnames_angle = angle,
# colnames_offset_y = 1,
hjust = 0,
font.size = 3) +
scale_fill_manual(values = dr_status_colours, breaks = names(dr_status_colours) )+
labs(fill = "DR\nstatus")
dr_status_hm_clust <- dr_status_hm_clust + ggnewscale::new_scale_fill()
final_plot <- gheatmap(dr_status_hm_clust, dr_data,
width = width_clust * ncol(dr_data),
# offset = ceiling(offset_clust * 3),
offset = (offset_clust * 2) + (offset_clust * 0.2),
# color = NULL,
low="white", high="black", color="black",
colnames_position = "top",
colnames_angle = angle,
colnames_offset_y = 0,
hjust = 0,
font.size = 2.5,
legend_title = "llw") +
scale_fill_manual(values=c("white", "black"), labels = c("Sensitive", "Resistant", "NA"), na.value = "grey")+
labs(fill = "Drug\nresistance")
print(final_plot)
}
# dev.off()